data(us.cities)
# Get major cities for each sample region (state)
.states <- c("OR", "VT", "CO", "NC")
top.cities <- purrr::map_df(.states, function(s) {
out <- us.cities %>%
filter(country.etc==s) %>%
mutate(city = gsub(paste0(" ", s), "", name)) %>%
arrange(-pop)
if (s == "OR") {
out <- out %>%
head() %>%
filter(!(city %in% c("Gresham", "Hillsboro", "Corvallis",
"Beaverton", "Springfield")))
} else if (s == "CO") {
out <- out %>%
head() %>%
filter(!(city %in% c("Thornton", "Lakewood", "Aurora")))
} else if (s == "NC") {
out <- out %>%
head() %>%
filter(!(city %in% c("Greensboro", "Durham", "Fayetteville")))
} else {
out <- out %>% head()
}
out
})
# Load the map data
states <- map_data("state") %>%
filter(region %in% c("oregon", "north carolina", "colorado", "vermont"))
# Load your data
data.files <- list.files("data/final", full.names = T)
df <- purrr::map_df(data.files, readRDS)
caps.after.ws <- function(string) {
gsub("(?<=\\s)([a-z])", "\\U\\1", string, perl = T)
}
# Define a function to create a plot for each species
plot.for.species <- function(spec, st.abbr) {
st <- case_when(st.abbr == "CO" ~ "colorado",
st.abbr == "NC" ~ "north carolina",
st.abbr == "VT" ~ "vermont",
st.abbr == "OR" ~ "oregon",
T ~ "")
title <- caps.after.ws(paste(st.abbr, gsub("_", " ", spec),
"Observations, 2016-2019"))
p <- ggplot(data = states %>% filter(region == st)) +
geom_polygon(aes(x = long, y = lat, group = group),
fill = "#989875", color = "black") +
geom_point(data = df %>% filter(state == st.abbr & common.name == spec),
aes(x = lon, y = lat),
size=1, alpha=.5, fill = "red", shape=21) +
geom_point(data = top.cities %>% filter(country.etc == st.abbr),
aes(x=long, y=lat),
fill="gold", color="black", size=3.5, shape = 21) +
geom_text(data = top.cities %>% filter(country.etc == st.abbr),
aes(x=long, y=lat, label=city),
color="white", hjust=case_when(st.abbr=="NC"~.2,
st.abbr=="VT"~.65,
T~.5),
vjust=ifelse(st.abbr=="NC", -.65, 1.5),
size=4) +
coord_map() +
ggtitle(title) +
theme_minimal() +
theme(panel.background = element_blank(),
axis.text = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank())
data.table(
state=st.abbr,
species=spec,
plot=list(p)
)
}
spec.state <- expand.grid(unique(df$common.name), unique(df$state)) %>%
rename(spec=Var1, st.abbr=Var2)
# Create a list of plots
plots <- purrr::map2_df(spec.state$spec,
spec.state$st.abbr,
~plot.for.species(.x, .y))
# Plot Ruddy Duck plots
do.call(ggpubr::ggarrange,
c(plots[species == "Ruddy Duck"]$plot,
list(nrow=2, ncol=2)))
# Plot Belted Kingfisher plots
do.call(ggpubr::ggarrange,
c(plots[species == "Belted Kingfisher"]$plot,
list(nrow=2, ncol=2)))
# Plot Wild Turkey plots
do.call(ggpubr::ggarrange,
c(plots[species == "Wild Turkey"]$plot,
list(nrow=2, ncol=2)))
# Plot Sharp-Shinned Hawk plots
do.call(ggpubr::ggarrange,
c(plots[species == "Sharp-shinned Hawk"]$plot,
list(nrow=2, ncol=2)))
# Plot Downy Woodpecker Plots
do.call(ggpubr::ggarrange,
c(plots[species == "Downy Woodpecker"]$plot,
list(nrow=2, ncol=2)))
# Plot Cedar Waxwing Plots
do.call(ggpubr::ggarrange,
c(plots[species == "Cedar Waxwing"]$plot,
list(nrow=2, ncol=2)))
# Plot Sandhill Crane Plots
do.call(ggpubr::ggarrange,
c(plots[species == "Sandhill Crane"]$plot,
list(nrow=2, ncol=2)))
# Plot Sanderling Plots
do.call(ggpubr::ggarrange,
c(plots[species == "Sanderling"]$plot,
list(nrow=2, ncol=2)))
states <- c("CO", "NC", "OR", "VT")
r.files <- paste0("data/final_rasters/", states, ".tif")
r.list <- purrr::map(r.files, rast)
names(r.list) <- states
TODO: - terra::freq - terra::density -
terra::layerCor
r.df <- map_df(states, function(s) {
df <- r.list[[s]] %>% as.data.frame()
names(df) <- names(df) %>% gsub(paste0("_", s), "", .)
df %>% setDT()
df[, state := factor(s, levels=states)]
df[apply(df, 1, function(.x) !any(is.na(.x)))]
})
# Custom function to process factor levels
clean.levels <- function(x) {
# Remove non-alphanumeric characters and replace with underscores
x <- gsub("[^a-zA-Z0-9]", "_", x)
# Convert to lowercase
x <- tolower(x)
# Remove any leading or trailing underscores
x <- gsub("^_|_$", "", x)
x <- gsub("__", "_", x)
x <- gsub("NLCD_Land_", "", x)
return(x)
}
r.df[, NLCD_Land := factor(clean.levels(levels(NLCD_Land))[NLCD_Land])]
# Convert factor columns to dummy variables
df.dummies <- data.table(model.matrix(~ . - 1, data = r.df[, .(NLCD_Land, state)])) %>%
cbind(r.df[, -which(names(r.df) %in% c("NLCD_Land", "state")), with=F])
names(df.dummies) <- gsub("NLCD_Land", "", names(df.dummies))
# Ensure that there is more than one value per column (remove otherwise)
uniq.1 <- t( df.dummies[, lapply(.SD, uniqueN)]) %>%
as.data.frame() %>%
filter(V1 == 1) %>%
row.names()
if (length(uniq.1) >= 1) {
df.dummies <- df.dummies[, -which(names(df.dummies) %in% uniq.1), with=F]
}
pca.fit <- PCA(df.dummies, graph=F)
plot.PCA(pca.fit, choix="var")
res <- pca.fit$var$coord %>%
as.data.frame() %>%
mutate(var=as.factor(rownames(.))) %>%
select(var, everything()) %>%
as_tibble()
rownames(res) <- NULL
p.d1 <- ggplot(res, aes(x = reorder(var, Dim.1), y = Dim.1)) +
geom_bar(stat = "identity", fill="darkblue") +
coord_flip() + # Makes it a horizontal bar chart
labs(title = "Variable importance for Dim.1", y = "Importance", x = "Variable") +
theme_minimal()
p.d2 <- ggplot(res, aes(x = reorder(var, Dim.2), y = Dim.2)) +
geom_bar(stat = "identity", fill="darkred") +
coord_flip() + # Makes it a horizontal bar chart
labs(title = "Variable importance for Dim.2", y = "Importance", x = "Variable") +
theme_minimal()
ggpubr::ggarrange(plotlist=list(p.d1, p.d2), nrow=2, ncol=1)
famd.fit <- FAMD(r.df, graph=F)
ggpubr::ggarrange(plotlist=purrr::map(
c("var", "quanti", "quali"),
~plot.FAMD(famd.fit, choix=.x)),
ncol=1, nrow=3)
res <- famd.fit$var$coord %>%
as.data.frame() %>%
mutate(var=as.factor(rownames(.))) %>%
select(var, everything()) %>%
as_tibble()
rownames(res) <- NULL
p.d1 <- ggplot(res, aes(x = reorder(var, Dim.1), y = Dim.1)) +
geom_bar(stat = "identity", fill="darkblue") +
coord_flip() + # Makes it a horizontal bar chart
labs(title = "Variable importance for Dim.1", y = "Importance", x = "Variable") +
theme_minimal()
p.d2 <- ggplot(res, aes(x = reorder(var, Dim.2), y = Dim.2)) +
geom_bar(stat = "identity", fill="darkred") +
coord_flip() + # Makes it a horizontal bar chart
labs(title = "Variable importance for Dim.2", y = "Importance", x = "Variable") +
theme_minimal()
ggpubr::ggarrange(plotlist=list(p.d1, p.d2), nrow=2, ncol=1)
# Set up output directory
output.dir <- "artifacts/masks_5k"
if (!dir.exists("artifacts")) dir.create("artifacts")
if (!dir.exists(output.dir)) dir.create(output.dir)
# LOAD DATA
# Get raster data
states <- c("CO", "NC", "OR", "VT")
r.list <- purrr::map(paste0("data/final_rasters/", states, ".tif"), rast)
names(r.list) <- states
# Get observation data
obs.df <- list.files("data/final", full.names = T) %>%
purrr::map_df(readRDS) %>%
select(state, common.name, observation.point=geometry)
# BUFFERING RASTER DATA
dist <- 5e3
mask.update <- function(i, mask.raster, obs.df, obs.field="observation.point",
dist=10000, u="m") {
obs.pt <- st_transform(obs.df[i, "observation.point"], st_crs(mask.raster))
# Create a buffer around the point, ensuring correct units
buf <- st_buffer(obs.pt, dist=units::as_units(paste(dist, u)))
return(terra::rasterize(buf, mask.raster, update=T, field=1))
}
# For each observation point, you can now create a distance
# raster and then mask cells within the buffer distance
get.buffered.zones <- function(r, obs.df, obs.field="observation.point",
dist=10000, u="m") {
# Create an empty raster with the same extent and resolution as r
mask.raster <- terra::rast(r)
# Recursively update mask.raster with additional buffered regions
for(i in 1:nrow(obs.df)) {
mask.raster <- mask.update(i, mask.raster, obs.df, obs.field=obs.field, dist=dist, u=u)
gc()
}
return(mask.raster)
}
# Get masks by state, species
masks <- purrr::map(states, function(state) {
specs <- sort(unique(obs.df$common.name))
spec.masks <- purrr::map(specs, function(spec, st=state) {
fname <- file.path(output.dir, paste0(st, "_", spec, ".tif"))
if (file.exists(fname)) {
cat("Reading", spec, st, "mask from", fname, "\n")
r.mask <- rast(fname)
} else {
cat("Computing", spec, st, "mask, and saving to", fname, "\n")
r.mask <- get.buffered.zones(r=r.list[[st]],
obs.df=filter(obs.df, state == st & common.name == spec),
dist=dist)
terra::writeRaster(r.mask, fname, overwrite=T)
}
gc()
r.mask
}, .progress=T)
names(spec.masks) <- specs
spec.masks
})
## Reading Belted Kingfisher CO mask from artifacts/masks_5k/CO_Belted Kingfisher.tif
## Reading Cedar Waxwing CO mask from artifacts/masks_5k/CO_Cedar Waxwing.tif
## Reading Downy Woodpecker CO mask from artifacts/masks_5k/CO_Downy Woodpecker.tif
## Reading Ruddy Duck CO mask from artifacts/masks_5k/CO_Ruddy Duck.tif
## Reading Sanderling CO mask from artifacts/masks_5k/CO_Sanderling.tif
## Reading Sandhill Crane CO mask from artifacts/masks_5k/CO_Sandhill Crane.tif
## Reading Sharp-shinned Hawk CO mask from artifacts/masks_5k/CO_Sharp-shinned Hawk.tif
## Reading Wild Turkey CO mask from artifacts/masks_5k/CO_Wild Turkey.tif
## Reading Belted Kingfisher NC mask from artifacts/masks_5k/NC_Belted Kingfisher.tif
## Reading Cedar Waxwing NC mask from artifacts/masks_5k/NC_Cedar Waxwing.tif
## Reading Downy Woodpecker NC mask from artifacts/masks_5k/NC_Downy Woodpecker.tif
## Reading Ruddy Duck NC mask from artifacts/masks_5k/NC_Ruddy Duck.tif
## Reading Sanderling NC mask from artifacts/masks_5k/NC_Sanderling.tif
## Reading Sandhill Crane NC mask from artifacts/masks_5k/NC_Sandhill Crane.tif
## Reading Sharp-shinned Hawk NC mask from artifacts/masks_5k/NC_Sharp-shinned Hawk.tif
## Reading Wild Turkey NC mask from artifacts/masks_5k/NC_Wild Turkey.tif
## Reading Belted Kingfisher OR mask from artifacts/masks_5k/OR_Belted Kingfisher.tif
## Reading Cedar Waxwing OR mask from artifacts/masks_5k/OR_Cedar Waxwing.tif
## Reading Downy Woodpecker OR mask from artifacts/masks_5k/OR_Downy Woodpecker.tif
## Reading Ruddy Duck OR mask from artifacts/masks_5k/OR_Ruddy Duck.tif
## Reading Sanderling OR mask from artifacts/masks_5k/OR_Sanderling.tif
## Reading Sandhill Crane OR mask from artifacts/masks_5k/OR_Sandhill Crane.tif
## Reading Sharp-shinned Hawk OR mask from artifacts/masks_5k/OR_Sharp-shinned Hawk.tif
## Reading Wild Turkey OR mask from artifacts/masks_5k/OR_Wild Turkey.tif
## Reading Belted Kingfisher VT mask from artifacts/masks_5k/VT_Belted Kingfisher.tif
## Reading Cedar Waxwing VT mask from artifacts/masks_5k/VT_Cedar Waxwing.tif
## Reading Downy Woodpecker VT mask from artifacts/masks_5k/VT_Downy Woodpecker.tif
## Reading Ruddy Duck VT mask from artifacts/masks_5k/VT_Ruddy Duck.tif
## Reading Sanderling VT mask from artifacts/masks_5k/VT_Sanderling.tif
## Reading Sandhill Crane VT mask from artifacts/masks_5k/VT_Sandhill Crane.tif
## Reading Sharp-shinned Hawk VT mask from artifacts/masks_5k/VT_Sharp-shinned Hawk.tif
## Reading Wild Turkey VT mask from artifacts/masks_5k/VT_Wild Turkey.tif
names(masks) <- states
# Function to sample n points from the non-masked parts
sample.inverse.mask <- function(r.original, r.mask, n,
species, state,
sample.crs=4326, min.n=500,
output.dir="artifacts/pseudo_absence_regions") {
if (!dir.exists(output.dir)) dir.create(output.dir)
output.path <- file.path(output.dir,
gsub(" |\\-", "_",
paste0(
paste(state, tolower(species), sep="_"),
".tif")
))
if (!file.exists(output.path)) {
# Get inverse mask;
# Set NA cells to 0, keep 0 cells as 0, change other cells to 1
r.inverse <- terra::ifel(is.na(r.mask), 0, r.mask)
# Set 0 to 1 and everything else to NA
r.inverse <- terra::lapp(r.inverse, fun = function(x) ifelse(x == 0, 1, NA))
# Crop so that anything outside of the state is NA
r.cropped <- terra::crop(r.inverse, terra::ext(r.original))
# Create a binary raster from r.original where valid values are
# set to 1 and NA values remain NA
r.binary <- terra::lapp(r.original[[1]], fun = function(x) ifelse(!is.na(x), 1, NA))
# Multiply the cropped raster by the binary raster to ensure
# outside values are set to NA
r.final <- r.cropped * r.binary
terra::writeRaster(r.final, output.path, overwrite=T)
} else {
r.final <- terra::rast(output.path)
}
# Convert the raster to SpatialPoints
gdf <- terra::as.points(r.final) %>%
st_as_sf() %>%
st_transform(crs=sample.crs)
if (nrow(gdf) > 0) {
gdf <- gdf %>%
filter(!is.na(layer)) %>%
select(geometry)
} else {
return(gdf)
}
# Set to min.n size if n < min.n
if (n < min.n) n <- min.n
# Make sure there is sufficient available sample points
if (n > nrow(gdf)) n <- nrow(gdf)
# Randomly sample n points from the available (non-masked) space
sample.idx <- sample(1:nrow(gdf), n)
samples <- gdf %>%
slice(sample.idx) %>%
mutate(common.name = species,
state = state,
lon = NA,
lat = NA,
observations=0)
# Populate lon and lat values:
coords <- st_coordinates(samples)
samples$lon <- coords[, "X"]
samples$lat <- coords[, "Y"]
return(samples)
}
# Get totals by species and state
totals <- obs.df %>%
as_tibble() %>%
select(state, common.name) %>%
group_by(state, common.name) %>%
summarize(N=n(), .groups="keep")
# Create a list of pseudo absence points, by species and state,
# where the sample number `n` >= 500 | `n` == the total observed
# for each respective state and species
pseudo.absence.pts <- list()
for (st in states) {
r.original <- r.list[[st]]
r.masks <- masks[[st]]
pseudo.absence.pts[[st]] <- list()
for (spec in names(r.masks)) {
r.mask <- r.masks[[spec]]
n <- totals %>% filter(state == st & common.name == spec) %>% pull(N)
cat("Generating", n, "pseudo-absence points for the", spec, "in", st, "\n")
pseudo.absence.pts[[st]][[spec]] <- sample.inverse.mask(
r.original, r.mask, spec, st, n=n, sample.crs=4326)
cat("\tGenerated", nrow(pseudo.absence.pts[[st]][[spec]]), "/", n, "points.\n")
}
}
## Generating 4551 pseudo-absence points for the Belted Kingfisher in CO
## Generated 4551 / 4551 points.
## Generating 3446 pseudo-absence points for the Cedar Waxwing in CO
## Generated 3446 / 3446 points.
## Generating 7511 pseudo-absence points for the Downy Woodpecker in CO
## Generated 7511 / 7511 points.
## Generating 1726 pseudo-absence points for the Ruddy Duck in CO
## Generated 1726 / 1726 points.
## Generating 131 pseudo-absence points for the Sanderling in CO
## Generated 500 / 131 points.
## Generating 1532 pseudo-absence points for the Sandhill Crane in CO
## Generated 1532 / 1532 points.
## Generating 2257 pseudo-absence points for the Sharp-shinned Hawk in CO
## Generated 2257 / 2257 points.
## Generating 2611 pseudo-absence points for the Wild Turkey in CO
## Generated 2611 / 2611 points.
## Generating 4183 pseudo-absence points for the Belted Kingfisher in NC
## Generated 4183 / 4183 points.
## Generating 4195 pseudo-absence points for the Cedar Waxwing in NC
## Generated 4195 / 4195 points.
## Generating 10914 pseudo-absence points for the Downy Woodpecker in NC
## Generated 10914 / 10914 points.
## Generating 1106 pseudo-absence points for the Ruddy Duck in NC
## Generated 1106 / 1106 points.
## Generating 311 pseudo-absence points for the Sanderling in NC
## Generated 500 / 311 points.
## Generating 118 pseudo-absence points for the Sandhill Crane in NC
## Generated 500 / 118 points.
## Generating 1254 pseudo-absence points for the Sharp-shinned Hawk in NC
## Generated 1254 / 1254 points.
## Generating 2372 pseudo-absence points for the Wild Turkey in NC
## Generated 2372 / 2372 points.
## Generating 5837 pseudo-absence points for the Belted Kingfisher in OR
## Generated 5837 / 5837 points.
## Generating 8446 pseudo-absence points for the Cedar Waxwing in OR
## Generated 8446 / 8446 points.
## Generating 8576 pseudo-absence points for the Downy Woodpecker in OR
## Generated 8576 / 8576 points.
## Generating 2010 pseudo-absence points for the Ruddy Duck in OR
## Generated 2010 / 2010 points.
## Generating 258 pseudo-absence points for the Sanderling in OR
## Generated 500 / 258 points.
## Generating 2458 pseudo-absence points for the Sandhill Crane in OR
## Generated 2458 / 2458 points.
## Generating 2714 pseudo-absence points for the Sharp-shinned Hawk in OR
## Generated 2714 / 2714 points.
## Generating 2440 pseudo-absence points for the Wild Turkey in OR
## Generated 2440 / 2440 points.
## Generating 2033 pseudo-absence points for the Belted Kingfisher in VT
## Generated 2033 / 2033 points.
## Generating 3938 pseudo-absence points for the Cedar Waxwing in VT
## Generated 1169 / 3938 points.
## Generating 4635 pseudo-absence points for the Downy Woodpecker in VT
## Generated 1679 / 4635 points.
## Generating 51 pseudo-absence points for the Ruddy Duck in VT
## Generated 500 / 51 points.
## Generating 39 pseudo-absence points for the Sanderling in VT
## Generated 500 / 39 points.
## Generating 76 pseudo-absence points for the Sandhill Crane in VT
## Generated 500 / 76 points.
## Generating 748 pseudo-absence points for the Sharp-shinned Hawk in VT
## Generated 748 / 748 points.
## Generating 2181 pseudo-absence points for the Wild Turkey in VT
## Generated 2181 / 2181 points.
# Extract raster values for each point
if (!dir.exists(file.path("data", "final_pseudo_absence"))) {
dir.create(file.path("data", "final_pseudo_absence"))
}
for (state in states) {
out.file.all <- file.path("data", "final_pseudo_absence", paste0("pa_", state, ".rds"))
if (!file.exists(out.file.all)) {
r <- r.list[[state]]
cat(sprintf("Extracting points to values for %s...\n", state))
# Load observations shapefile
geo.df <- pseudo.absence.pts[[state]] %>% do.call("rbind", .)
rownames(geo.df) <- NULL
geo.df.crs <- st_crs(geo.df)
# Define target CRS and update
target.crs <- st_crs(r)
cat(sprintf("Updating CRS for %s...\n", state))
geo.df <- st_transform(geo.df, target.crs)
# Extract raster values
for (r.name in names(r)) {
cat("\tExtracting", r.name, "values for", state, "\n")
x <- terra::extract(r[[r.name]], geo.df)[[r.name]]
geo.df[[gsub(paste0("_", state), "", r.name)]] <- x
}
# Update crs back
geo.df <- st_transform(geo.df, geo.df.crs)
# Fix names; Filter NA values
geo.df <- geo.df %>%
filter(dplyr::if_all(names(.), ~!is.na(.))) %>%
suppressWarnings()
saveRDS(geo.df, out.file.all)
cat("--------------\n")
}
}
# Get all pseudo-absence data
abs.df <- list.files(file.path("data", "final_pseudo_absence"), full.names = T) %>%
purrr::map_df(readRDS) %>%
select(state, common.name, observation.point=geometry)
# There might be some slight differences since there are occasionally NA values
abs.df %>%
as_tibble() %>%
select(state, common.name) %>%
group_by(state, common.name) %>%
summarize(psuedo.absence.N=n(), .groups="keep") %>%
left_join(totals, by=c("state", "common.name")) %>%
rename(observed.N = N) %>%
knitr::kable()
| state | common.name | psuedo.absence.N | observed.N |
|---|---|---|---|
| CO | Belted Kingfisher | 4523 | 4551 |
| CO | Cedar Waxwing | 3431 | 3446 |
| CO | Downy Woodpecker | 7440 | 7511 |
| CO | Ruddy Duck | 1715 | 1726 |
| CO | Sanderling | 497 | 131 |
| CO | Sandhill Crane | 1524 | 1532 |
| CO | Sharp-shinned Hawk | 2241 | 2257 |
| CO | Wild Turkey | 2597 | 2611 |
| NC | Belted Kingfisher | 4042 | 4183 |
| NC | Cedar Waxwing | 4059 | 4195 |
| NC | Downy Woodpecker | 10415 | 10914 |
| NC | Ruddy Duck | 1076 | 1106 |
| NC | Sanderling | 488 | 311 |
| NC | Sandhill Crane | 489 | 118 |
| NC | Sharp-shinned Hawk | 1211 | 1254 |
| NC | Wild Turkey | 2261 | 2372 |
| OR | Belted Kingfisher | 5803 | 5837 |
| OR | Cedar Waxwing | 8405 | 8446 |
| OR | Downy Woodpecker | 8529 | 8576 |
| OR | Ruddy Duck | 1996 | 2010 |
| OR | Sanderling | 496 | 258 |
| OR | Sandhill Crane | 2443 | 2458 |
| OR | Sharp-shinned Hawk | 2696 | 2714 |
| OR | Wild Turkey | 2429 | 2440 |
| VT | Belted Kingfisher | 1956 | 2033 |
| VT | Cedar Waxwing | 1098 | 3938 |
| VT | Downy Woodpecker | 1598 | 4635 |
| VT | Ruddy Duck | 490 | 51 |
| VT | Sanderling | 493 | 39 |
| VT | Sandhill Crane | 492 | 76 |
| VT | Sharp-shinned Hawk | 730 | 748 |
| VT | Wild Turkey | 2090 | 2181 |
TODO: Include pseudo-absence data
stratified.split.idx <- function(df, p=0.7, lat.lon.bins=25) {
# Cut along lat/lon values to create grids (lat.bin & lon.bin)
# lat.lon.bins is the number of divisions you want
df$lat.bin <- cut(df$lat, breaks=lat.lon.bins, labels = F)
df$lon.bin <- cut(df$lon, breaks=lat.lon.bins, labels = F)
# Create a new variable combining the stratification variables
df %>%
mutate(strata = paste(lat.bin, lon.bin, common.name, state)) %>%
pull(strata) %>%
# Create the data partitions
createDataPartition(., p = p, list = F) %>%
suppressWarnings()
}
prepare.data <- function(df, p=.7, lat.lon.bins=25) {
train.index <- stratified.split.idx(df, p=p, lat.lon.bins = lat.lon.bins)
df.train <- df[train.index, ]
df.test <- df[-train.index, ]
list(train = df.train,
test = df.test,
index = train.index)
}
train.test <- prepare.data(df, .7)
train <- df[train.test$index,]
test <- df[-train.test$index,]
Each of the 20 different Land Cover Categories falls under a “parent” category (see National Land Cover Database Class Legend and Description).